`%>%` <- magrittr::`%>%`
Definition of delta statistic
#' Delta statistic: AUC shifted by 0.5
#'
#' Pools the two samples, labels every value from `x` as class "X" and every
#' value from `y` as class "Y", and passes them to `rocauc::auc_by()` with "Y"
#' as the third argument. 0.5 is subtracted from whatever that call returns.
#' NOTE(review): presumably "Y" is the positive class and the result lies in
#' [-0.5, 0.5] -- confirm against the rocauc documentation.
#'
#' @param x Values for the first sample (labelled "X").
#' @param y Values for the second sample (labelled "Y").
#' @return The value of `rocauc::auc_by()` minus 0.5.
stat_auc <- function(x, y) {
  pooled <- c(x, y)
  # One class label per pooled value, in the same order as `pooled`.
  group <- rep(c("X", "Y"), times = c(length(x), length(y)))
  rocauc::auc_by(pooled, group, "Y") - 0.5
}
#' Apply a two-sample statistic to one variable of two data sets
#'
#' @param dx,dy List-like objects (e.g. data frames) indexable with `[[`.
#' @param var Name (or index) of the element to extract from both `dx` and `dy`.
#' @param stat Function of two arguments; called as `stat(dx[[var]], dy[[var]])`.
#' @return Whatever `stat` returns.
apply_stat <- function(dx, dy, var, stat) {
  stat(dx[[var]], dy[[var]])
}
Plot of English statistics for segments attested with frequency >= 5
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Number of potential consonants by language
# Per-language column totals of `added_consonants`.
# Every column except labels, freq and scores is summed; the result is a
# two-column tibble with the total (nsegs) and the column name (language).
# dplyr::select() replaces the superseded select_at(vars(...)), and colSums()
# replaces apply(2, sum) on a data frame. colSums() always returns doubles,
# which matches the <dbl> column shown in the rendered output.
# NOTE(review): assumes the remaining columns are numeric indicator columns,
# one per language -- confirm against where added_consonants is built.
ncons_by_lang <- added_consonants %>%
  dplyr::select(-labels, -freq, -scores) %>%
  colSums() %>%
  (function(totals) tibble::tibble(nsegs = totals, language = names(totals)))
Top N languages
# Number of languages to show in the ranking below.
N_LANG <- 10
# Sort languages by descending segment count and print the first N_LANG rows.
ncons_by_lang %>%
  dplyr::arrange(dplyr::desc(nsegs)) %>%
  head(N_LANG) %>%
  print()
## # A tibble: 10 x 2
## nsegs language
## <dbl> <chr>
## 1 61 uby
## 2 51 nmn
## 3 49 ady
## 4 38 gdo
## 5 38 mrt
## 6 37 sna
## 7 36 tkr
## 8 35 lez
## 9 33 ven
## 10 31 kbd
## [[1]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.0764968 -0.4032222
## stat_loc 0.0764968 1.0000000 0.1912467
## stat_glob -0.4032222 0.1912467 1.0000000
##
## [[2]]
## stat_econ stat_loc stat_glob
## stat_econ 1.00000000 -0.09232765 -0.3826825
## stat_loc -0.09232765 1.00000000 0.2879037
## stat_glob -0.38268249 0.28790375 1.0000000
##
## [[3]]
## stat_econ stat_loc stat_glob
## stat_econ 1.00000000 0.09481393 -0.4019823
## stat_loc 0.09481393 1.00000000 0.2743994
## stat_glob -0.40198228 0.27439944 1.0000000
##
## [[4]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.0540669 -0.4253403
## stat_loc 0.0540669 1.0000000 -0.2374706
## stat_glob -0.4253403 -0.2374706 1.0000000
##
## [[5]]
## stat_econ stat_loc stat_glob
## stat_econ 1.00000000 -0.03807736 -0.4469134
## stat_loc -0.03807736 1.00000000 0.1637639
## stat_glob -0.44691344 0.16376394 1.0000000
##
## [[6]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.2734516 -0.2668327
## stat_loc 0.2734516 1.0000000 0.3819063
## stat_glob -0.2668327 0.3819063 1.0000000
##
## [[7]]
## stat_econ stat_loc stat_glob
## stat_econ 1.00000000 0.2729433 -0.02274032
## stat_loc 0.27294326 1.0000000 0.12066396
## stat_glob -0.02274032 0.1206640 1.00000000
##
## [[8]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.2886127 -0.4142538
## stat_loc 0.2886127 1.0000000 -0.1251715
## stat_glob -0.4142538 -0.1251715 1.0000000
##
## [[9]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.1882623 -0.2694531
## stat_loc 0.1882623 1.0000000 -0.0700509
## stat_glob -0.2694531 -0.0700509 1.0000000
##
## [[10]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.1083463 -0.3616778
## stat_loc 0.1083463 1.0000000 0.0190826
## stat_glob -0.3616778 0.0190826 1.0000000
Merge the five “common” languages that would work OK (Hindi, Malayalam, Venda, Ndebele, and Kabardian)
# 3D scatter of stat_econ / stat_loc / stat_glob for rows flagged (== 1) in any
# of hin, mal, ven, nbl or kbd. Points are drawn as text labels plus markers
# and coloured by log(freq).
dplyr::filter(stats, hin == 1 | mal == 1 | ven == 1 | nbl == 1 | kbd == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq),
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Hindi by itself
# 3D scatter of stat_econ / stat_loc / stat_glob for rows with hin == 1.
# Points are drawn as text labels plus markers and coloured by log(freq).
dplyr::filter(stats, hin == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq),
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
# Dump the labels column of `stats` (all rows, unfiltered) to labels.csv in the
# current working directory.
write.csv(stats$labels, file = "labels.csv")
Kabardian by itself
# 3D scatter of stat_econ / stat_loc / stat_glob for rows with kbd == 1.
# Points are drawn as text labels plus markers and coloured by log(freq).
dplyr::filter(stats, kbd == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq),
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Kabardian prime by itself
# 3D scatter of stat_econ / stat_loc / stat_glob for rows with kbd_prime == 1.
# Points are drawn as text labels plus markers and coloured by log(freq).
dplyr::filter(stats, kbd_prime == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq),
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Kabardian prime and Hindi
# 3D scatter of stat_econ / stat_loc / stat_glob for rows with kbd_prime == 1
# or hin == 1. Points are drawn as text labels plus markers and coloured by
# log(freq).
dplyr::filter(stats, kbd_prime == 1 | hin == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq),
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Hindi and Kabardian
# 3D scatter of stat_econ / stat_loc / stat_glob for rows with hin == 1 or
# kbd == 1. Points are drawn as text labels plus markers and coloured by
# log(freq).
dplyr::filter(stats, hin == 1 | kbd == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq),
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Hindi and Malayalam
# 3D scatter of stat_econ / stat_loc / stat_glob for rows with hin == 1 or
# mal == 1. Points are drawn as text labels plus markers and coloured by
# log(freq).
dplyr::filter(stats, hin == 1 | mal == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq),
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Hindi, Malayalam, and Georgian
# 3D scatter of stat_econ / stat_loc / stat_glob for rows with hin == 1,
# kat == 1 or mal == 1. Points are drawn as text labels plus markers and
# coloured by log(freq).
dplyr::filter(stats, hin == 1 | kat == 1 | mal == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq),
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays